#!/usr/bin/py2
# -*- coding: utf-8 -*-
#encoding=utf-8

# @author Funsion Wu
from scrapy.exceptions import DropItem
import hashlib

class DuplicatesPipeline(object):
    """Item pipeline that drops items whose ``source_url`` was already seen.

    Each URL is reduced to its MD5 hex digest and remembered in an in-memory
    set, so the de-duplication state lasts only as long as this pipeline
    instance does.
    """

    def __init__(self):
        # MD5 hex digests of every ``source_url`` that has passed through.
        self.ids_seen = set()

    def md5(self, str):
        """Return the hexadecimal MD5 digest of ``str``.

        :param str: text or bytes-like value to hash. Text is encoded as
            UTF-8 first, because ``hashlib`` only accepts bytes; bytes-like
            input is hashed unchanged.
        :returns: the 32-character lowercase hex digest.
        """
        # NOTE: the parameter name shadows the ``str`` builtin. It is kept so
        # keyword callers (``md5(str=...)``) keep working; the body therefore
        # must not rely on the builtin.
        data = str
        if not isinstance(data, (bytes, bytearray, memoryview)):
            # Text input: hashing it directly raises TypeError on Python 3
            # and UnicodeEncodeError on Python 2 for non-ASCII characters.
            data = data.encode('utf-8')
        return hashlib.md5(data).hexdigest()

    def process_item(self, item, spider):
        """Pass ``item`` through unless its ``source_url`` is a duplicate.

        :param item: mapping-like item; must provide a ``'source_url'`` key.
        :param spider: the spider that produced the item (unused here).
        :returns: ``item`` unchanged when its URL has not been seen before.
        :raises DropItem: when an item with the same ``source_url`` has
            already been processed by this pipeline instance.
        """
        source_url = item['source_url']
        # De-duplicate on the fingerprint (MD5) of source_url.
        fingerprint = self.md5(source_url)
        if fingerprint in self.ids_seen:
            raise DropItem("Duplicate item found: %s" % source_url)
        self.ids_seen.add(fingerprint)
        return item
